In [1]:
from __future__ import print_function
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
# Loading the Digits dataset
digits = datasets.load_digits()
# To apply a classifier on this data, we need to flatten the images,
# turning the data into a (n_samples, n_features) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)
# Set the parameters by cross-validation
tuned_parameters = {'kernel': ['rbf'], 'gamma': 10. ** np.arange(-3, 3),
                    'C': 10. ** np.arange(-3, 3)}
scores = ['accuracy']
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring=score)
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for mean, std, params in zip(clf.cv_results_['mean_test_score'],
                                 clf.cv_results_['std_test_score'],
                                 clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
# Note: the problem is too easy; the hyperparameter plateau is too flat, so
# precision and recall yield the same model, with ties in quality.
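The full grid is easier to inspect as a table; a minimal sketch, assuming pandas is available:
import pandas as pd
# one row per (C, gamma) combination tried by the grid search
results = pd.DataFrame(clf.cv_results_)
print(results[['param_C', 'param_gamma', 'mean_test_score', 'std_test_score']])
print("best CV score: %0.3f" % clf.best_score_)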
In [2]:
import numpy as np
scores = np.array(clf.cv_results_['mean_test_score'])
In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.matshow(scores.reshape(6, 6))
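The heatmap reads more easily with the parameter values on the axes. A sketch, assuming scikit-learn's usual grid ordering (parameter names sorted alphabetically, so the 36 combinations form six blocks of constant C, with gamma varying inside each block):
plt.matshow(scores.reshape(6, 6))
plt.xlabel("gamma (10^x)")
plt.ylabel("C (10^y)")
plt.xticks(np.arange(6), np.arange(-3, 3))
plt.yticks(np.arange(6), np.arange(-3, 3))
plt.colorbar()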
In [4]:
from sklearn import datasets
In [5]:
from sklearn.datasets import make_blobs
In [6]:
X, y = make_blobs(centers=2, random_state=4)
In [7]:
plt.scatter(X[:, 0], X[:, 1], c=np.array(['red', 'blue'])[y])
y_noisy = y.copy()
In [8]:
y_noisy[X[:, 0] > 11.3] = 1
In [9]:
plt.scatter(X[:, 0], X[:, 1], c=np.array(['red', 'blue'])[y_noisy])
In [10]:
plt.scatter(X[:, 0], X[:, 1], c=y_noisy, s=50)
for i, x in enumerate(X):
    plt.text(x[0], x[1], i)
In [11]:
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
X, y = make_blobs(centers=2, random_state=4, n_samples=30)
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
y_noisy = y.copy()
y_noisy[7] = 0
y_noisy[27] = 0
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
for ax, C in zip(axes, [1e-2, 1, 1e2]):
    ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y_noisy])
    svm = SVC(kernel='linear', C=C, tol=0.00001).fit(X, y_noisy)
    # alternative: svm = LinearSVC(C=C, tol=0.00001, dual=False).fit(X, y_noisy)
    # plot the separating line w . x + b = 0
    w = svm.coef_[0]
    a = -w[0] / w[1]
    xx = np.linspace(6, 13)
    yy = a * xx - svm.intercept_[0] / w[1]
    ax.plot(xx, yy, c='k')
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title("C = %g" % C)
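For a linear SVM the margin half-width is 1 / ||w||, so the effect of C can also be read off numerically. A small sketch, reusing the last model fitted in the loop above:
print("||w|| = %.2f, margin width = %.2f"
      % (np.linalg.norm(svm.coef_[0]), 2 / np.linalg.norm(svm.coef_[0])))
print("support vectors per class:", svm.n_support_)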
In [12]:
from sklearn.tree import DecisionTreeClassifier
In [13]:
tree = DecisionTreeClassifier().fit(X, y)
In [72]:
from io import StringIO
import re
import pydot
from sklearn.tree import export_graphviz
def tree_image(tree, fout=None):
    # export the tree in graphviz dot format, stripping the impurity and
    # sample-count annotations to keep the rendered image compact
    dot_data = StringIO()
    export_graphviz(tree, out_file=dot_data)
    data = re.sub(r"gini = 0\.[0-9]+\\n", "", dot_data.getvalue())
    data = re.sub(r"samples = [0-9]+\\n", "", data)
    data = re.sub(r"\\nsamples = [0-9]+", "", data)
    # recent versions of pydot return a list of graphs
    graph = pydot.graph_from_dot_data(data)[0]
    if fout is None:
        fout = "tmp.png"
    graph.write_png(fout)
    return plt.imread(fout)
In [71]:
X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50)
from scipy import ndimage
fig, axes = plt.subplots(2, 6, figsize=(30, 8))
h = 0.02
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
for ax, max_depth in zip(axes.T, [0, 1, 2, 3, 5, 20]):
    if max_depth != 0:
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=1).fit(X, y)
        Z = tree.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        Z = Z.reshape(xx.shape)
        # find the leaf each grid point falls into and mark the leaf borders
        faces = tree.tree_.apply(np.c_[xx.ravel(), yy.ravel()].astype(np.float32))
        faces = faces.reshape(xx.shape)
        border = ndimage.laplace(faces) != 0
        ax[0].contourf(xx, yy, Z, alpha=.4)
        ax[0].scatter(xx[border], yy[border], marker='.', s=1)
        ax[0].set_title("max_depth = %d" % max_depth)
        ax[1].imshow(tree_image(tree))
        ax[1].axis("off")
    ax[0].scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60)
    ax[0].set_xlim(x_min, x_max)
    ax[0].set_ylim(y_min, y_max)
    ax[0].set_xticks(())
    ax[0].set_yticks(())
axes[1, 0].set_visible(False)
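The overfitting visible in the deeper panels can also be quantified. A minimal sketch that holds out part of the same blobs and compares train and test accuracy per depth:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
for depth in [1, 2, 3, 5, 20]:
    t = DecisionTreeClassifier(max_depth=depth, random_state=1).fit(X_tr, y_tr)
    print("max_depth=%2d  train: %.2f  test: %.2f"
          % (depth, t.score(X_tr, y_tr), t.score(X_te, y_te)))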
In [75]:
X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50)
from scipy import ndimage
h = 0.02
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
for max_depth in [0, 1, 2, 3, 5, 20]:
    plt.figure()
    ax0 = plt.gca()
    if max_depth != 0:
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=1).fit(X, y)
        Z = tree.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        Z = Z.reshape(xx.shape)
        faces = tree.tree_.apply(np.c_[xx.ravel(), yy.ravel()].astype(np.float32))
        faces = faces.reshape(xx.shape)
        border = ndimage.laplace(faces) != 0
        ax0.contourf(xx, yy, Z, alpha=.4)
        ax0.scatter(xx[border], yy[border], marker='.', s=1)
        ax0.set_title("max_depth = %d" % max_depth)
        tree_image(tree, "tree_graph_max_depth_%d.png" % max_depth)
    ax0.scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60)
    ax0.set_xlim(x_min, x_max)
    ax0.set_ylim(y_min, y_max)
    ax0.set_xticks(())
    ax0.set_yticks(())
    plt.savefig("tree_max_depth_%d.png" % max_depth, bbox_inches='tight')
In [86]:
from sklearn.ensemble import RandomForestClassifier
X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50)
fig, axes = plt.subplots(2, 4, figsize=(12, 5))
for ax, max_depth in zip(axes.T, [1, 2, 3, 5]):
    forest = RandomForestClassifier(max_depth=max_depth, random_state=1,
                                    n_estimators=50, max_features=1).fit(X, y)
    h = 0.02
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = forest.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)
    # export the first few individual trees of the ensemble
    for i, est in enumerate(forest.estimators_):
        if i > 10:
            break
        tree_image(est, "rf_max_depth_%d_tree_%d.png" % (max_depth, i))
    # bottom row: averaged class probabilities of the ensemble
    ax[1].contourf(xx, yy, Z, alpha=.8)
    ax[1].scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60)
    ax[1].set_xlim(x_min, x_max)
    ax[1].set_ylim(y_min, y_max)
    ax[1].set_xticks(())
    ax[1].set_yticks(())
    # top row: the resulting decision boundary at p = 0.5
    ax[0].set_title("max_depth: %d" % max_depth)
    ax[0].contour(xx, yy, Z, levels=[0.5], colors='k')
    ax[0].scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60)
    ax[0].set_xlim(x_min, x_max)
    ax[0].set_ylim(y_min, y_max)
    ax[0].set_xticks(())
    ax[0].set_yticks(())
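Because every tree is fit on a bootstrap sample, the forest can estimate its own generalization error from the samples each tree did not see. A sketch using the out-of-bag score (oob_score=True relies on the default bootstrap=True):
forest = RandomForestClassifier(n_estimators=50, max_depth=3, max_features=1,
                                oob_score=True, random_state=1).fit(X, y)
print("out-of-bag accuracy: %.2f" % forest.oob_score_)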
In [18]:
plt.scatter(X[:, 0], X[:, 1], c=y_noisy, s=50)
for i, x in enumerate(X):
    plt.text(x[0], x[1], i, fontsize=20)
In [19]:
np.bincount(y_noisy)
In [20]:
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
X, y = make_blobs(centers=2, random_state=4, n_samples=30)
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
h = 0.02
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
fig, axes = plt.subplots(1, 4, figsize=(15, 3))
y_noisy = y.copy()
y_noisy[7] = 0
y_noisy[27] = 0
# restore balance
mask = np.ones(len(X), dtype=bool)
mask[np.array([0, 1, 5, 26])] = 0
X = X[mask]
y_noisy = y_noisy[mask]
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
for ax, C in zip(axes, [1e0, 5, 10, 100]):
    svm = SVC(gamma=.1, kernel='rbf', C=C, tol=0.00001).fit(X, y_noisy)
    Z = svm.decision_function(np.c_[xx.ravel(), yy.ravel()])
    print(svm.score(X, y_noisy))
    Z = Z.reshape(xx.shape)
    # the decision boundary is where the decision function crosses zero
    ax.contour(xx, yy, Z, levels=[0])
    ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['b', 'r'])[y_noisy])
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title("C = %g" % C)
In [21]:
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
X, y = make_blobs(centers=2, random_state=4, n_samples=30)
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
h = 0.02
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
fig, axes = plt.subplots(1, 4, figsize=(15, 3))
y_noisy = y.copy()
y_noisy[7] = 0
y_noisy[27] = 0
# restore balance
mask = np.ones(len(X), dtype=bool)
mask[np.array([0, 1, 5, 26])] = 0
X = X[mask]
y_noisy = y_noisy[mask]
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
for ax, gamma in zip(axes, [0.1, .5, 1, 10]):
    svm = SVC(gamma=gamma, kernel='rbf', C=1, tol=0.00001).fit(X, y_noisy)
    Z = svm.decision_function(np.c_[xx.ravel(), yy.ravel()])
    print(svm.score(X, y_noisy))
    Z = Z.reshape(xx.shape)
    ax.contour(xx, yy, Z, levels=[0])
    ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['b', 'r'])[y_noisy])
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title("gamma = %g" % gamma)
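C and gamma interact, so on real problems they are best tuned jointly rather than one at a time. The GridSearchCV pattern from the top of the notebook applies directly to this noisy toy data; a minimal sketch:
param_grid = {'C': 10. ** np.arange(-2, 3), 'gamma': 10. ** np.arange(-2, 2)}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
grid.fit(X, y_noisy)
print(grid.best_params_, "CV accuracy: %.2f" % grid.best_score_)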
In [22]:
plt.matshow(Z)
In [23]:
plt.contourf(xx, yy, Z)